Kaldi is an open-source toolkit for speech recognition and speech signal processing, written in C++ and developed by Dan Povey, a legendary figure in the speech recognition field. Day to day, you drive it mainly through shell scripts. The three main Kaldi-related websites are the official documentation site (https://kaldi-asr.org), the GitHub repository (https://github.com/kaldi-asr/kaldi), and the kaldi-help discussion group (https://groups.google.com/g/kaldi-help).
To set up the Kaldi environment, first clone the GitHub project, then go into the tools/ directory and run the following commands in order:
git clone https://github.com/kaldi-asr/kaldi.git
cd kaldi/tools
extras/check_dependencies.sh
# or, if you need to check against a specific compiler:
CXX=g++-4.8 extras/check_dependencies.sh
make
# or build in parallel with multiple CPUs (4 for example)
make -j 4
Next, go into the src/ directory and run the following commands:
./configure --shared
# or build in parallel with multiple CPUs (4 for example)
make depend -j 4
make -j 4
This basically follows the installation procedure described in the INSTALL file.
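Once everything compiles, a quick sanity check (a minimal sketch; it assumes the build succeeded and that you are still in the src/ directory) is to run one of the freshly built binaries:

# prints the tool's usage text if the build succeeded
featbin/compute-mfcc-feats --help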
The GitHub project contains many directories and files. Based on the official documentation's recommendations and my own experience, the ones you will touch most often are tools/ (third-party dependencies), src/ (the C++ source code), and egs/ (example recipes for various corpora).
Next, we will take the formosa recipe as our example, modify it, and walk through how Kaldi actually runs.
Let's start with the main script, run.sh, which consists mainly of the following steps:
Lexicon preparation (local/prepare_dict.sh)
Data preparation (local/prepare_data.sh)
Neural network model training (local/chain/run_tdnn.sh)
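One thing worth knowing before reading the script: because run.sh sources utils/parse_options.sh, every variable defined before that line (stage, num_jobs, train_dir, ...) can be overridden from the command line, with dashes mapping to underscores. A hypothetical invocation that skips data preparation and resumes from feature extraction might look like this (the paths are placeholders):

bash run.sh --stage -1 --num-jobs 8 --train-dir /path/to/train --eval-dir /path/to/eval

Here is the full run.sh: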
stage=-2
num_jobs=16
train_dir=<train-data-path>
eval_dir=<eval-data-path>
# shell options
set -eo pipefail
. ./cmd.sh
. ./utils/parse_options.sh
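# (cmd.sh defines $train_cmd and $decode_cmd, which control how jobs are
# launched, e.g. run.pl for local execution; parse_options.sh enables the
# command-line overrides described above)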
# Configure the number of jobs running in parallel; adjust num_jobs above according to your machine.
# data preparation
if [ $stage -le -2 ]; then
# Lexicon Preparation,
echo "$0: Lexicon Preparation"
local/prepare_dict.sh || exit 1;
# Data Preparation
echo "$0: Data Preparation"
local/prepare_data.sh --train-dir $train_dir || exit 1;
# Phone Sets, questions, L compilation
echo "$0: Phone Sets, questions, L compilation Preparation"
rm -rf data/lang
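# (prepare_lang.sh takes <dict-dir> <oov-word> <tmp-dir> <out-lang-dir>; "<SIL>"
# is the entry that unknown words are mapped to, and --position-dependent-phones
# false disables word-position-dependent phone variants)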
utils/prepare_lang.sh --position-dependent-phones false data/local/dict \
"<SIL>" data/local/lang data/lang || exit 1;
# LM training
echo "$0: LM training"
rm -rf data/local/lm/3gram-mincount
local/train_lms.sh || exit 1;
# G compilation, check LG composition
echo "$0: G compilation, check LG composition"
utils/format_lm.sh data/lang data/local/lm/3gram-mincount/lm_unpruned.gz \
data/local/dict/lexicon.txt data/lang_test || exit 1;
fi
# Now make MFCC plus pitch features.
# mfccdir should be some place with a largish disk where you
# want to store MFCC features.
mfccdir=mfcc
# mfcc
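# (for each data set: make_mfcc_pitch.sh extracts MFCC+pitch features,
# compute_cmvn_stats.sh computes per-speaker cepstral mean/variance statistics,
# and fix_data_dir.sh re-sorts the directory and drops broken entries)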
if [ $stage -le -1 ]; then
echo "$0: making mfccs"
for x in train test ; do
steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj $num_jobs data/$x exp/make_mfcc/$x $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1;
utils/fix_data_dir.sh data/$x || exit 1;
done
fi
# mono
if [ $stage -le 0 ]; then
echo "$0: train mono model"
# Make some small data subsets for early system-build stages.
echo "$0: make training subsets"
utils/subset_data_dir.sh --shortest data/train 3000 data/train_mono
# train mono
steps/train_mono.sh --boost-silence 1.25 --cmd "$train_cmd" --nj $num_jobs \
data/train_mono data/lang exp/mono || exit 1;
# Get alignments from monophone system.
steps/align_si.sh --boost-silence 1.25 --cmd "$train_cmd" --nj $num_jobs \
data/train data/lang exp/mono exp/mono_ali || exit 1;
# Monophone decoding
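# (the subshell below runs in the background, note the trailing "&", so the
# next training stage can start while decoding is still running)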
(
utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph || exit 1;
steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj $num_jobs \
exp/mono/graph data/test exp/mono/decode_test
)&
fi
# tri1
if [ $stage -le 1 ]; then
echo "$0: train tri1 model"
# train tri1 [first triphone pass]
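# (the two numeric arguments are the number of decision-tree leaves (2500) and
# the total number of Gaussians (20000); the same convention applies to the
# later tri* stages)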
steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
2500 20000 data/train data/lang exp/mono_ali exp/tri1 || exit 1;
# align tri1
steps/align_si.sh --cmd "$train_cmd" --nj $num_jobs \
data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
# decode tri1
(
utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1;
steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj $num_jobs \
exp/tri1/graph data/test exp/tri1/decode_test
)&
fi
# tri2
if [ $stage -le 2 ]; then
echo "$0: train tri2 model"
# train tri2 [delta+delta-deltas]
steps/train_deltas.sh --cmd "$train_cmd" \
2500 20000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1;
# align tri2
steps/align_si.sh --cmd "$train_cmd" --nj $num_jobs \
data/train data/lang exp/tri2 exp/tri2_ali || exit 1;
# decode tri2
(
utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph
steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj $num_jobs \
exp/tri2/graph data/test exp/tri2/decode_test
)&
fi
# tri3a
if [ $stage -le 3 ]; then
echo "$-: train tri3 model"
# Train tri3a, which is LDA+MLLT,
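# (LDA+MLLT estimates a linear transform over spliced frames instead of using
# delta features)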
steps/train_lda_mllt.sh --cmd "$train_cmd" \
2500 20000 data/train data/lang exp/tri2_ali exp/tri3a || exit 1;
# decode tri3a
(
utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1;
steps/decode.sh --cmd "$decode_cmd" --nj $num_jobs --config conf/decode.config \
exp/tri3a/graph data/test exp/tri3a/decode_test
)&
fi
# tri4
if [ $stage -le 4 ]; then
echo "$0: train tri4 model"
# From now, we start building a more serious system (with SAT), and we'll
# do the alignment with fMLLR.
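# (SAT = speaker adaptive training; fMLLR = feature-space maximum likelihood
# linear regression, a per-speaker feature transform)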
steps/align_fmllr.sh --cmd "$train_cmd" --nj $num_jobs \
data/train data/lang exp/tri3a exp/tri3a_ali || exit 1;
steps/train_sat.sh --cmd "$train_cmd" \
2500 20000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1;
# align tri4a
steps/align_fmllr.sh --cmd "$train_cmd" --nj $num_jobs \
data/train data/lang exp/tri4a exp/tri4a_ali
# decode tri4a
(
utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph
steps/decode_fmllr.sh --cmd "$decode_cmd" --nj $num_jobs --config conf/decode.config \
exp/tri4a/graph data/test exp/tri4a/decode_test
)&
fi
# tri5
if [ $stage -le 5 ]; then
echo "$0: train tri5 model"
# Building a larger SAT system.
steps/train_sat.sh --cmd "$train_cmd" \
3500 100000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1;
# align tri5a
steps/align_fmllr.sh --cmd "$train_cmd" --nj $num_jobs \
data/train data/lang exp/tri5a exp/tri5a_ali || exit 1;
# decode tri5
(
utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph || exit 1;
steps/decode_fmllr.sh --cmd "$decode_cmd" --nj $num_jobs --config conf/decode.config \
exp/tri5a/graph data/test exp/tri5a/decode_test || exit 1;
)&
fi
# nnet3 tdnn models
# commented out by default, since the chain model is usually faster and better
#if [ $stage -le 6 ]; then
# echo "$0: train nnet3 model"
# local/nnet3/run_tdnn.sh
#fi
# chain model
if [ $stage -le 7 ]; then
# The iVector-extraction and feature-dumping parts could be skipped by setting "--train_stage 7"
echo "$0: train chain model"
local/chain/run_tdnn.sh
fi
# getting results (see RESULTS file)
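# (each decode directory contains wer_*/cer_* files scored at different LM
# weights; utils/best_wer.sh picks the best-scoring line from each set)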
if [ $stage -le 8 ]; then
echo "$0: extract the results" |& tee -a RETRAIN_RESULTS
for test_set in test ; do
echo "WER: $test_set" |& tee -a RETRAIN_RESULTS
for x in exp/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done |& tee -a RETRAIN_RESULTS
for x in exp/*/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done |& tee -a RETRAIN_RESULTS
echo |& tee -a RETRAIN_RESULTS
echo "CER: $test_set" |& tee -a RETRAIN_RESULTS
for x in exp/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done |& tee -a RETRAIN_RESULTS
for x in exp/*/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done |& tee -a RETRAIN_RESULTS
echo |& tee -a RETRAIN_RESULTS
done
fi
# finish
echo "$0: all done"
exit 0;
Explaining every single step in detail would take days, so tomorrow I will focus on the details of three parts: lexicon preparation, data preparation, and neural network model training.